While a small number of threatened species do have large ranges, Range appears to be a strong predictor of Group status: a smaller range predicts a higher likelihood of falling into the threatened or extinct group.
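As a quick check of this claim, the median Range per Group can be compared directly. A minimal sketch, assuming `train` holds the training data with a numeric `Range` column and the `Group` label (as used below); if the claim holds, the threatened and extinct groups should show markedly smaller medians:

library(dplyr)

# Distribution of Range within each Group
train %>%
  group_by(Group) %>%
  summarise(n = n(),
            median_range = median(Range, na.rm = TRUE),
            iqr_range    = IQR(Range, na.rm = TRUE))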
library(magrittr)  # %<>% assignment pipe

# Bin Range into 20 quantiles, then treat all variables as factors
# so they can enter the pairwise Cramer's V matrix
corrDF <- train %>% mutate(Range = ntile(Range, n = 20))
corrDF %<>% mutate_at(c("Group", "LF", "GF", "Biomes", "Range",
                        "Habitat_degradation", "Habitat_loss", "IAS", "Other",
                        "Over_exploitation", "Pollution", "Unknown"), factor)

# Pairwise Cramer's V between all factors, shown as a correlation plot
corrplot::corrplot(DescTools::PairApply(corrDF, DescTools::CramerV), type = 'lower')
Group Counts Pre-Balancing: 490 148 23
Group Counts Post-Balancing: 490 490 490
library(ROSE)  # ovun.sample for random over-sampling

# Pairwise over-sampling: balance class 2 against class 1, then class 3
# against class 1, and recombine so every class has 490 observations
data_train_AB <- data_train[data_train$label != '3', ]
data_train_AB_resampled <- ovun.sample(label ~ ., data = data_train_AB,
                                       method = "over", N = 980, seed = 1)$data

data_train_AC <- data_train[data_train$label != '2', ]
data_train_AC_resampled <- ovun.sample(label ~ ., data = data_train_AC,
                                       method = "over", N = 980, seed = 1)$data

# Keep the original class 1 rows once, plus the over-sampled 2s and 3s
data_train_1 <- data_train_AB_resampled[data_train_AB_resampled$label == '1', ]
data_train_AB_2 <- data_train_AB_resampled[data_train_AB_resampled$label == '2', ]
data_train_AC_3 <- data_train_AC_resampled[data_train_AC_resampled$label == '3', ]
data_train_combined <- rbind(data_train_1, data_train_AB_2, data_train_AC_3)

cat("Group Counts Pre-Balancing: ", table(data_train$label),
    "\nGroup Counts Post-Balancing: ", table(data_train_combined$label))

\(X_{new}=\frac{X_{old}-\min(X_{old})}{\max(X_{old})-\min(X_{old})}\)
          Score
Accuracy   0.88
Recall     0.79
Precision  0.82
F1         0.81
library(randomForest)
library(caret)  # confusionMatrix

# Min-max scale each feature to [0, 1] (test set scaled with its own min/max)
features_train_1 <- as.data.frame(lapply(features_train,
                                         function(x) (x - min(x)) / (max(x) - min(x))))
features_test_1 <- as.data.frame(lapply(features_test,
                                        function(x) (x - min(x)) / (max(x) - min(x))))
data_train_1 <- features_train_1
data_train_1$label <- label
class_counts_1 <- table(data_train_1$label)

# Random forest on the scaled features (label column dropped from x)
model_1 <- randomForest(x = data_train_1[-ncol(data_train_1)],
                        y = as.factor(data_train_1$label), ntree = 2)
variable_importance_1 <- importance(model_1)

# Evaluate on the held-out test set
pred_comb_1 <- predict(model_1, features_test_1)
accuracy <- sum(label_test == pred_comb_1) / length(label_test)
label_test_factor <- as.factor(label_test)
pred_comb_1_factor <- as.factor(pred_comb_1)
cm <- confusionMatrix(pred_comb_1_factor, label_test_factor)

# Macro-average recall and precision over the three classes
recall <- mean(cm$byClass[, "Sensitivity"])
precision <- mean(cm$byClass[, "Pos Pred Value"])
F1 <- 2 * recall * precision / (recall + precision)

printTable <- matrix(round(c(accuracy, recall, precision, F1), 2), ncol = 1)
colnames(printTable) <- 'Score'
rownames(printTable) <- c('Accuracy', 'Recall', 'Precision', 'F1')
print(printTable)

\(X_{new}=\frac{X_{old}-\bar{X}_{old}}{\sigma_{X_{old}}}\)
          Score
Accuracy   0.40
Recall     0.53
Precision  0.41
F1         0.46
# Z-score standardise each feature (test set uses its own mean/sd)
features_train_2 <- as.data.frame(lapply(features_train,
                                         function(x) (x - mean(x)) / sd(x)))
features_test_2 <- as.data.frame(lapply(features_test,
                                        function(x) (x - mean(x)) / sd(x)))
data_train_2 <- features_train_2
data_train_2$label <- label
class_counts_2 <- table(data_train_2$label)

model_2 <- randomForest(x = data_train_2[-ncol(data_train_2)],
                        y = as.factor(data_train_2$label), ntree = 2)
variable_importance_2 <- importance(model_2)

pred_comb_2 <- predict(model_2, features_test_2)
accuracy <- sum(label_test == pred_comb_2) / length(label_test)
label_test_factor <- as.factor(label_test)
pred_comb_2_factor <- as.factor(pred_comb_2)
cm <- confusionMatrix(pred_comb_2_factor, label_test_factor)

recall <- mean(cm$byClass[, "Sensitivity"])
precision <- mean(cm$byClass[, "Pos Pred Value"])
F1 <- 2 * recall * precision / (recall + precision)

printTable <- matrix(round(c(accuracy, recall, precision, F1), 2), ncol = 1)
colnames(printTable) <- 'Score'
rownames(printTable) <- c('Accuracy', 'Recall', 'Precision', 'F1')
print(printTable)

\(X_{new}=\frac{X_{old}}{\max(|X_{old}|)}\)
          Score
Accuracy   0.88
Recall     0.71
Precision  0.79
F1         0.75
# Max-abs scale each feature into [-1, 1]
features_train_3 <- as.data.frame(lapply(features_train,
                                         function(x) x / max(abs(x))))
features_test_3 <- as.data.frame(lapply(features_test,
                                        function(x) x / max(abs(x))))
data_train_3 <- features_train_3
data_train_3$label <- label
class_counts_3 <- table(data_train_3$label)

model_3 <- randomForest(x = data_train_3[-ncol(data_train_3)],
                        y = as.factor(data_train_3$label), ntree = 2)
variable_importance_3 <- importance(model_3)

pred_comb_3 <- predict(model_3, features_test_3)
accuracy <- sum(label_test == pred_comb_3) / length(label_test)
label_test_factor <- as.factor(label_test)
pred_comb_3_factor <- as.factor(pred_comb_3)
cm <- confusionMatrix(pred_comb_3_factor, label_test_factor)

recall <- mean(cm$byClass[, "Sensitivity"])
precision <- mean(cm$byClass[, "Pos Pred Value"])
F1 <- 2 * recall * precision / (recall + precision)

printTable <- matrix(round(c(accuracy, recall, precision, F1), 2), ncol = 1)
colnames(printTable) <- 'Score'
rownames(printTable) <- c('Accuracy', 'Recall', 'Precision', 'F1')
print(printTable)

\(X_{new}=\frac{X_{old}}{\sum{\left|X_{old}\right|}}\)
          Score
Accuracy   0.77
Recall     0.57
Precision  0.54
F1         0.56
# L1-normalise each feature (divide by the sum of absolute values)
features_train_4 <- as.data.frame(lapply(features_train,
                                         function(x) x / sum(abs(x))))
features_test_4 <- as.data.frame(lapply(features_test,
                                        function(x) x / sum(abs(x))))
data_train_4 <- features_train_4
data_train_4$label <- label
class_counts_4 <- table(data_train_4$label)

model_4 <- randomForest(x = data_train_4[-ncol(data_train_4)],
                        y = as.factor(data_train_4$label), ntree = 2)
variable_importance_4 <- importance(model_4)

pred_comb_4 <- predict(model_4, features_test_4)
accuracy <- sum(label_test == pred_comb_4) / length(label_test)
label_test_factor <- as.factor(label_test)
pred_comb_4_factor <- as.factor(pred_comb_4)
cm <- confusionMatrix(pred_comb_4_factor, label_test_factor)

recall <- mean(cm$byClass[, "Sensitivity"])
precision <- mean(cm$byClass[, "Pos Pred Value"])
F1 <- 2 * recall * precision / (recall + precision)

printTable <- matrix(round(c(accuracy, recall, precision, F1), 2), ncol = 1)
colnames(printTable) <- 'Score'
rownames(printTable) <- c('Accuracy', 'Recall', 'Precision', 'F1')
print(printTable)

\(X_{new}=\frac{X_{old}}{\sqrt{\sum{X_{old}^{2}}}}\)
          Score
Accuracy   0.75
Recall     0.43
Precision  0.49
F1         0.46
# L2-normalise each feature (divide by the Euclidean norm)
features_train_5 <- as.data.frame(lapply(features_train,
                                         function(x) x / sqrt(sum(x^2))))
features_test_5 <- as.data.frame(lapply(features_test,
                                        function(x) x / sqrt(sum(x^2))))
data_train_5 <- features_train_5
data_train_5$label <- label
class_counts_5 <- table(data_train_5$label)

model_5 <- randomForest(x = data_train_5[-ncol(data_train_5)],
                        y = as.factor(data_train_5$label), ntree = 2)
variable_importance_5 <- importance(model_5)

pred_comb_5 <- predict(model_5, features_test_5)
accuracy <- sum(label_test == pred_comb_5) / length(label_test)
label_test_factor <- as.factor(label_test)
pred_comb_5_factor <- as.factor(pred_comb_5)
cm <- confusionMatrix(pred_comb_5_factor, label_test_factor)

recall <- mean(cm$byClass[, "Sensitivity"])
precision <- mean(cm$byClass[, "Pos Pred Value"])
F1 <- 2 * recall * precision / (recall + precision)

printTable <- matrix(round(c(accuracy, recall, precision, F1), 2), ncol = 1)
colnames(printTable) <- 'Score'
rownames(printTable) <- c('Accuracy', 'Recall', 'Precision', 'F1')
print(printTable)

# Ensemble the five models by majority vote over their predictions
n <- length(pred_comb_1)
final_pred <- rep(NA, n)
for (i in 1:n) {
  preds <- c(pred_comb_1[i], pred_comb_2[i], pred_comb_3[i],
             pred_comb_4[i], pred_comb_5[i])
  # Most frequent class among the five models wins the vote
  final_pred[i] <- as.numeric(names(which.max(table(preds))))
}
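The same vote can also be expressed without the explicit loop. A minimal sketch, assuming the five prediction vectors are aligned over the same test set (`vote_matrix` and `final_pred_alt` are illustrative names, not part of the original analysis):

# One row per test case, one column per model; majority class per row
vote_matrix <- cbind(as.character(pred_comb_1), as.character(pred_comb_2),
                     as.character(pred_comb_3), as.character(pred_comb_4),
                     as.character(pred_comb_5))
final_pred_alt <- as.numeric(apply(vote_matrix, 1,
                                   function(row) names(which.max(table(row)))))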
# Average variable importance across the five forests
importances_list <- list(variable_importance_1, variable_importance_2,
                         variable_importance_3, variable_importance_4,
                         variable_importance_5)
average_importance <- Reduce("+", importances_list) / length(importances_list)
print(average_importance)

# Score the ensemble
accuracy <- sum(label_test == final_pred) / length(label_test)
print(paste('Accuracy of Voting method:', accuracy))
final_pred_factor <- as.factor(final_pred)
label_test_factor <- as.factor(label_test)
cm_vote <- confusionMatrix(final_pred_factor, label_test_factor)

# Macro-average recall and precision over the three classes
recall <- mean(cm_vote$byClass[, "Sensitivity"])
print(paste('Recall :', recall))
precision <- mean(cm_vote$byClass[, "Pos Pred Value"])
print(paste('Precision :', precision))
F1 <- 2 * recall * precision / (recall + precision)
print(paste('F1 :', F1))
Metric definitions:

Accuracy: \(\frac{\sum{\left(\text{Actual Label} = \text{Predicted Label}\right)}}{\text{Label Count}}\)
Recall: \(\frac{\text{True Positives}}{\text{True Positives} + \text{False Negatives}}\)
Precision: \(\frac{\text{True Positives}}{\text{True Positives}+\text{False Positives}}\)
F1: \(\frac{2*(\text{Precision}*\text{Recall})}{\text{Precision}+\text{Recall}}\)
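Since this macro-averaged computation is repeated verbatim for each model above, it could be factored into a small helper. A minimal sketch, assuming caret is loaded and a multiclass problem (three groups here); `macro_metrics` is a hypothetical helper, not part of the original code:

# Macro-averaged metrics from a prediction vector and ground-truth labels
macro_metrics <- function(pred, truth) {
  cm <- caret::confusionMatrix(as.factor(pred), as.factor(truth))
  accuracy  <- as.numeric(cm$overall["Accuracy"])
  recall    <- mean(cm$byClass[, "Sensitivity"], na.rm = TRUE)     # macro recall
  precision <- mean(cm$byClass[, "Pos Pred Value"], na.rm = TRUE)  # macro precision
  f1 <- 2 * recall * precision / (recall + precision)
  round(c(Accuracy = accuracy, Recall = recall, Precision = precision, F1 = f1), 2)
}

# Example: macro_metrics(final_pred, label_test)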
Confusion Matrix and Statistics

          Reference
Prediction   1   2   3
         1 200  12   1
         2  10  51   0
         3   0   0   9

Overall Statistics

               Accuracy : 0.9187
                 95% CI : (0.8805, 0.9478)
    No Information Rate : 0.742
    P-Value [Acc > NIR] : 3.035e-14
                  Kappa : 0.7929
 Mcnemar's Test P-Value : NA

Statistics by Class:

                     Class: 1 Class: 2 Class: 3
Sensitivity            0.9524   0.8095  0.90000
Specificity            0.8219   0.9545  1.00000
Pos Pred Value         0.9390   0.8361  1.00000
Neg Pred Value         0.8571   0.9459  0.99635
Prevalence             0.7420   0.2226  0.03534
Detection Rate         0.7067   0.1802  0.03180
Detection Prevalence   0.7527   0.2155  0.03180
Balanced Accuracy      0.8871   0.8820  0.95000
library(ggplot2)
library(gridExtra)  # tableGrob, grid.arrange
library(grid)       # textGrob, gpar

# Plot the voting ensemble's confusion matrix alongside its overall statistics
cm_d <- as.data.frame(cm_vote$table)
cm_st <- data.frame(cm_vote$overall)
cm_st[, 1] <- round(cm_st[, 1], 2)
cm_d$diag <- cm_d$Prediction == cm_d$Reference   # diagonal (correct) cells
cm_d$ndiag <- cm_d$Prediction != cm_d$Reference  # off-diagonal cells
cm_d[cm_d == 0] <- NA
# Reverse the Reference levels so the diagonal runs top-left to bottom-right
cm_d$Reference <- factor(cm_d$Reference, levels = rev(levels(cm_d$Reference)))
cm_d$ref_freq <- cm_d$Freq * ifelse(is.na(cm_d$diag), -1, 1)

plt1 <- ggplot(data = cm_d, aes(x = Prediction, y = Reference, fill = Freq)) +
  scale_x_discrete(position = "top") +
  geom_tile(data = cm_d, aes(fill = ref_freq)) +
  scale_fill_gradient2(guide = "none", low = "red", high = "mediumvioletred",
                       mid = "mistyrose", midpoint = 0, na.value = 'white') +
  geom_text(aes(label = Freq), color = 'black', size = 3) +
  theme_bw() +
  theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank(),
        legend.position = "none",
        panel.border = element_blank(),
        plot.background = element_blank(),
        axis.line = element_blank())

plt2 <- tableGrob(cm_st)
grid.arrange(plt1, plt2, nrow = 1, ncol = 2,
             top = textGrob("Confusion Matrix", gp = gpar(fontsize = 25, font = 1)))